# load libraries ----------------------------------------------------------

library(tidyverse)


# download original files -------------------------------------------------

temp <- tempfile()

download.file(
        "https://www.bundeswahlleiter.de/en/dam/jcr/ce2d2b6a-f211-4355-8eea-355c98cd4e47/btw_kerg.zip",
        temp
)
unzip(zipfile = temp, exdir = "./data")

unlink(temp)


# list the relevant files -------------------------------------------------

# all files
list_all <- list.files(path = "data/")

# relevant files
list_relevant <- list_all[!grepl("kerg2", list_all)]


# import secondary data ---------------------------------------------------

load("03_compile_names.RData")


# 1949 --------------------------------------------------------------------

# guess the file encoding
guessing <- guess_encoding("data/btw1949_kerg.csv")[1, 1][[1]]
encoding <- if_else(guessing == "UTF-8", "UTF-8", "latin1")

# find out the number of lines to skip
skip = sapply("data/btw1949_kerg.csv",
              function(x)
                      min(grep("^[WN]", readLines(x, n = 10)) - 1))

df_49 <-
        read.csv(
                "data/btw1949_kerg.csv",
                encoding = encoding,
                skip = skip,
                sep = ";",
        ) |>
        select(-Land,-Wähler,-Gültige,-starts_with("X")) |>
        rename(constituency_no = 1,
               electorate = 2) |>
        filter(str_detect(constituency_no, "^[012]")) |>
        mutate(election = 1949) |>
        relocate(election) |>
        pivot_longer(
                cols = 4:last_col(),
                names_to = "party",
                values_to = "first_vote",
                values_transform = list(values = as.integer)
        ) |>
        mutate(second_vote = NA_real_)


# 1953:2002 ---------------------------------------------------------------

list_5302 <-
        parse_number(list_relevant)[between(parse_number(list_relevant), 1953, 2002)]

the_list <- list()

for (i in 1:length(list_5302)) {
        # guess the file encoding
        guessing <-
                guess_encoding(paste0("data/btw", list_5302[i], "_kerg.csv"))[1, 1][[1]]
        encoding <- if_else(guessing == "UTF-8", "UTF-8", "latin1")
        
        # find out the number of lines to skip
        skip <-
                sapply(paste0("data/btw", list_5302[i], "_kerg.csv"),
                       function(x)
                               min(grep("^[WN]", readLines(x, n = 10)) - 1))
        
        # read the data
        df_temp <-
                read.csv(
                        paste0("data/btw", list_5302[i], "_kerg.csv"),
                        encoding = encoding,
                        skip = skip,
                        sep = ";"
                )  |>
                select(-Land,-starts_with("X"), starts_with("X5"))
        
        # merge top two lines to create variable names
        colnames(df_temp) <- paste(sep = '_', colnames(df_temp),
                                   as.character(unlist(df_temp[1,])))
        
        df_temp <- df_temp |>
                rename(constituency_no = 1,
                       electorate = 2) |>
                filter(
                        str_detect(constituency_no, "[:digit:]") &
                                as.numeric(constituency_no) <= 328
                ) |>
                mutate(election = list_5302[i]) |>
                relocate(election) |>
                pivot_longer(
                        cols = 4:last_col(),
                        names_to = "party",
                        values_to = "values",
                        values_transform = list(values = as.integer)
                ) |>
                filter(!str_detect(party, "^Gültige") &
                               !str_detect(party, "^Wähler")) |>
                mutate(
                        votes = case_when(
                                str_detect(party, "_Erststimmen") ~ "first_vote",
                                str_detect(party, "_Zweitstimmen") ~ "second_vote"
                        ),
                        party = str_remove(party, "_Erststimmen"),
                        party = str_remove(party, ".1_Zweitstimmen"),
                        party = str_remove(party, "_Zweitstimmen"),
                ) |>
                pivot_wider(names_from = votes, values_from = values)
        
        # add to the empty list
        the_list[[i]] <- df_temp
        
}

df_5302 <- do.call(rbind, the_list)


# 2005:2021 ---------------------------------------------------------------

list_0521 <-
        parse_number(list_relevant)[between(parse_number(list_relevant), 2005, 2021)]

the_list <- list()

for (i in 1:length(list_0521)) {
        # guess the file encoding
        guessing <-
                guess_encoding(paste0("data/btw", list_0521[i], "_kerg.csv"))[1, 1][[1]]
        encoding <- if_else(guessing == "UTF-8", "UTF-8", "latin1")
        
        # find out the number of lines to skip
        skip <-
                sapply(paste0("data/btw", list_0521[i], "_kerg.csv"),
                       function(x)
                               min(grep("^[WN]", readLines(x, n = 10)) - 1))
        
        # read the data
        df_temp <-
                read.csv(
                        paste0("data/btw", list_0521[i], "_kerg.csv"),
                        encoding = encoding,
                        skip = skip,
                        sep = ";",
                ) |>
                select(-2)
        
        # get ready to fill empty headers
        fill_colnames <- names(df_temp) |>
                as_tibble() |>
                mutate(value = case_when(
                        value == "X" ~ NA_character_,
                        str_detect(value, "^[X]\\.[0-9]") ~ NA_character_,
                        TRUE ~ value
                )) |>
                fill(value)
        
        # merge headers with top three lines to create variable names
        colnames(df_temp) <- paste(sep = '_',
                                   fill_colnames$value,
                                   as.character(unlist(df_temp[1,])),
                                   as.character(unlist(df_temp[2,])))
        
        # remove any trailing columns
        df_temp <- df_temp[, 1:nrow(fill_colnames)]
        
        df_temp <- df_temp |>
                select(-contains("Vorperiode")) |>
                rename(
                        constituency_no = 1,
                        state_no = 2,
                        electorate = 3
                ) |>
                filter(
                        !is.na(constituency_no) &
                                constituency_no != "999" &
                                constituency_no < 900 &
                                state_no < 99
                ) |>
                mutate(election = list_0521[i]) |>
                relocate(election) |>
                select(-state_no) |>
                pivot_longer(
                        cols = 4:last_col(),
                        names_to = "party",
                        values_to = "values",
                        values_transform = list(values = as.integer)
                ) |>
                filter(
                        !str_detect(party, "^Gültige") & !str_detect(party, "^Wähler") &
                                !str_detect(party, "^Wahlberechtigte") &
                                !str_detect(party, "^Wählende")
                ) |>
                mutate(
                        votes = case_when(
                                str_detect(party, "_Erststimmen") ~ "first_vote",
                                str_detect(party, "_Zweitstimmen") ~ "second_vote"
                        ),
                        party = str_remove(party, "_Erststimmen_Endgültig"),
                        party = str_remove(party, "_Zweitstimmen_Endgültig")
                ) |>
                pivot_wider(names_from = votes, values_from = values)
        
        # add to the empty list
        the_list[[i]] <- df_temp
        
}

df_0521 <- do.call(rbind, the_list) |>
        select(-"NA")


# merge and tidy ----------------------------------------------------------

df <- rbind(df_49, df_5302, df_0521) |>
        mutate(
                electorate = as.integer(electorate),
                constituency_no = as.numeric(constituency_no),
                first_vote = replace(first_vote, first_vote == 0, NA_real_),
                second_vote = replace(second_vote, second_vote == 0, NA_real_)
        ) |>
        filter(!(is.na(first_vote) & is.na(second_vote))) |>
        
        left_join(
                df_constituencies,
                by = c("election" = "election",
                       "constituency_no" = "constituency_no")
        ) |>
        left_join(df_parties, by = c("election" = "election",
                                     "party" = "party")) |>
        group_by(election, constituency_no) |>
        mutate(
                turnout = sum(first_vote, na.rm = TRUE),
                turnout_percent = round(sum(first_vote, na.rm = TRUE) * 100 / electorate, 1),
                first_vote_percent = if_else(
                        party_abbrev == "_invalid",
                        round(first_vote * 100 / sum(first_vote, na.rm = TRUE), 1),
                        round(first_vote * 100 / sum(first_vote[party_abbrev != "_invalid"], na.rm = TRUE), 1)
                ),
                second_vote_percent = if_else(
                        party_abbrev == "_invalid",
                        round(second_vote * 100 / sum(second_vote, na.rm = TRUE), 1),
                        round(second_vote * 100 / sum(second_vote[party_abbrev != "_invalid"], na.rm = TRUE), 1)
                )
        ) |>
        ungroup() |>
        mutate(rank_helper = case_when(party_abbrev == "_other" ~ 1,
                                       party_abbrev == "_invalid" ~ 2,
                                       TRUE ~ 0)) |>
        arrange(party_name) |>
        arrange(desc(first_vote)) |>
        arrange(rank_helper) |>
        arrange(constituency_no) |>
        arrange(election) |>
        mutate(row_id = row_number()) |>
        relocate(
                row_id,
                election,
                state_no,
                state_abbrev,
                state_name,
                constituency_no,
                constituency_name,
                electorate,
                turnout,
                turnout_percent,
                party_abbrev,
                party_name,
                first_vote,
                first_vote_percent,
                second_vote,
                second_vote_percent
        ) |>
        select(-party,-rank_helper)


# save --------------------------------------------------------------------

write.table(
        df,
        "01_bundestag_elections.tab",
        quote = FALSE,
        sep = "\t",
        row.names = FALSE,
        na = ""
)